home *** CD-ROM | disk | FTP | other *** search
Text File | 1987-11-03 | 12.3 KB | 402 lines | [TEXT/KAHL] |
- /* words to handle subset (proximity filter) tasks....
- * ^z -- 870810-13-....
- *
- * Subset (or subindex) restrictions are useful in doing searches for
- * combinations of terms, phrases, etc., where the terms are themselves
- * common words. Subsets also become indispensible when working in
- * very large databases, where even slightly-exotic terms occur too many
- * times for convenient browsing through their CONTEXT displays.
- *
- * When a subset has been defined, then instead of showing an INDEX
- * item as something like:
- * 3141 UNIX
- * that item is displayed with a count of how many occurrences are
- * 'valid' in addition to the total number of occurrences:
- * 27/3141 UNIX
- * Thus, in this case only 27 out of the 3141 appearances of the term
- * 'UNIX' happened to be in the neighborhood of the set of words chosen
- * by the user to define the current working subset.
- *
- * The 'neighborhood' (a half dozen words or so on each side) of a chosen
- * index term is added (logical OR) into the working subset by giving
- * the '&' command when that term's INDEX display is the current item.
- * If a larger neighborhood (half a dozen sentences or so each way) is
- * preferable, use the '&&' command; for a still larger neighborhood
- * (a few paragraphs on each side), use the '&&&' command.
- *
- *
- * IMPLEMENTATION NOTES:
- *
- * Subest searching is handled by an array of flag bits -- one bit
- * for each chunk of NEIGHBORHOOD bytes in the original text. Bits are
- * held in the array subset[]; they are zero for 'invalid' regions
- * of the original document, and one for 'valid' regions that have
- * been defined by proximity to index terms selected by the user.
- *
- * A value of 32 for NEIGHBORHOOD seems to work very well for defining
- * proximity. The compression factor from the original document to the
- * array subset[] is NEIGHBORHOOD*8 = 256 in that case. So, even for a
- * document that is tens of megabytes long, the subset[] array only
- * requires a few hundred kB and it is quite feasible to hold it in
- * memory.
- *
- * The presence of a subset is signalled when the global array pointer
- * 'subset' is non-NULL ... in which case show_current_index_item,
- * make_move, and other words modify their behavior in order to
- * reflect the chosen subset. If the subset is completely
- * empty, after having just been created or flushed, then the
- * global variable empty_subset is nonzero (true) and short-cuts
- * are taken in counting and displaying words....
- *
- * Consider a single occurrence of a word. Suppose we want to add the
- * immediate neighborhood of that word to our working subset. To avoid
- * edge-effects due to the chunkiness of the definition of neighborhood,
- * the routine to set bits in subset[] actually sets 2*WORD_RANGE+1 bits:
- * the bit corresponding to the chunk wherein the chosen word falls,
- * and WORD_RANGE extra bits on each side.
- *
- * Thus, for the choice WORD_RANGE = 1, any index item which is
- * within NEIGHBORHOOD (=32) bytes of our chosen word is certain
- * to be included in the proximity subset; any item which occurs
- * at least 2*NEIGHBORHOOD (=64) bytes away is certain NOT to be
- * included, and items in the intermediate range have a
- * linearly-decreasing probability of false inclusion. The choice
- * of 32 for NEIGHBORHOOD and 1 for WORD_RANGE therefore gives
- * essentially all items within half a dozen or so words
- * (average word length is 5-6 letters) of the term used to
- * define the subset....
- *
- * In extensive experimentation with the MacForth-based Browser v.244+
- * program on the Macintosh, the above definition of proximity seemed
- * to be by far the most useful one in real life. It is also quite
- * straightforward to implement with good efficiency. In addition
- * to the 'word'-neighborhood proximity (within WORD_RANGE chunks),
- * it is occasionally convenient to add a somewhat larger
- * neighborhood to the working subset -- corresponding to proximity
- * within a few sentences or even a few paragraphs. To do that,
- * the && (sentence-proximity) and &&& (paragraph-proximity) commands
- * simply set more bits on either side of the word's location in
- * subset[]. Specifically, && sets SENTENCE_RANGE bits on each side,
- * and &&& sets PARAGRAPH_RANGE bits on each side. Values of 10 and 100
- * respectively for those parameters seem to work quite well.
- */
-
-
- #include <stdio.h> /* for FILE, printf(), etc. */
- #include <storage.h> /* for calloc(), free(), etc. */
- #include <unix.h> /* for exit(), etc. */
- #include <proto.h> /* for function prototypes */
- #include "brwsr.h" /* for various definitions */
- #include "brwsr.proto.h" /* for my function prototypes */
-
-
- /* Handle the * command to empty out the working subset before
- * beginning to do proximity-filtered searching (that is, to create
- * the array subset[] in empty form).
- * Also handle the ** command to destroy subset[] (which makes it
- * seem as if it is "full" from the user's viewpoint)
- *
- * Force the user back to INDEX level when the working subindex
- * is emptied or filled, to avoid inconsistencies in moving around
- * the CONTEXT level....
- */
-
- void do_make_subset (cmd)
- char cmd[];
- {
- void beep (), create_subset (), destroy_subset (),
- show_current_item ();
- extern int level;
-
- if (cmd[1] == '\0')
- create_subset ();
- else if (cmd[1] == '*')
- destroy_subset ();
- else
- beep ();
-
- level = INDEX;
- show_current_item ();
- }
-
-
- /* Do the job of creating the empty array subset[] ... note that if
- * running on a machine with 16-bit int variables such as Lightspeed C
- * on the Mac, the calloc() function won't work for files bigger
- * than 16 MB or so (for NEIGHBORHOOD = 32) ....must modify this
- * to use clalloc(), the non-ANSI standard function for allocating
- * and clearing bigger memory chunks. Hence, the #ifdef LIGHTSPEED
- * lines below....
- *
- * Note that there is no need to provide a "No file open!" error msg
- * since show_current_item() in calling function will do that for
- * us....
- */
-
- void create_subset ()
- {
- extern char* subset;
- extern FILE *doc_file;
- extern int empty_subset;
- extern long max_item[];
- #ifdef LIGHTSPEED
- char *clalloc ();
- #else
- char *calloc ();
- #endif
- void beep (), destroy_subset ();
-
- if (doc_file == NULL)
- return;
-
- destroy_subset ();
- #ifdef LIGHTSPEED
- if ((subset = clalloc ((unsigned long)(1 +
- max_item[TEXT]/(NEIGHBORHOOD*8)), (long)sizeof(char))) == NULL)
- #else
- if ((subset = calloc ((unsigned int)(1 +
- max_item[TEXT]/(NEIGHBORHOOD*8)), sizeof(char))) == NULL)
- #endif
- {
- beep ();
- printf ("Not enough memory for subset!\n");
- return;
- }
-
- empty_subset = TRUE;
- }
-
-
- /* routine to destroy a subset (used in response to a ** command,
- * or when changing over to a new file with a : command, or when about
- * to create a new empty subset inside the * command)
- */
-
- void destroy_subset ()
- {
- extern char *subset;
-
- if (subset != NULL)
- free (subset);
- subset = NULL;
- }
-
-
- /* routine to invert the working subindex (response to a ~ command) --
- * perform a logical NOT on the entire set, so that what was once a
- * member of the set becomes a non-member, and vice versa....
- *
- * Must set empty_subset FALSE since we don't a priori know that the
- * resulting set is empty or full or what....
- *
- * As with * and ** commands, kick the user back to the INDEX level
- * when this command is executed, for safety's sake....
- */
-
- void do_invert_subset ()
- {
- register long i, lim;
- extern char *subset;
- extern long max_item[];
- extern int empty_subset;
- void beep();
-
- if (doc_file == NULL)
- {
- beep ();
- printf ("No file open!\n");
- return;
- }
- if (subset == NULL)
- {
- beep ();
- printf ("No subset!\n");
- return;
- }
-
- lim = max_item[TEXT]/(NEIGHBORHOOD*8);
- for (i = 0; i <= lim; ++i)
- subset[i] = ~subset[i];
-
- empty_subset = FALSE;
- level = INDEX;
- show_current_item ();
- }
-
-
- /* handle the '&' command to add the current word's neighborhood to
- * the working subset:
- * & adds a few-words neighborhood
- * && adds a few-sentences neighborhood
- * &&& adds a few-paragraphs neighborhood
- *
- * Note that instead of calling show_current_item() to finish up
- * this function, since we know that every occurrence of the
- * current item will be in the working subset we can save time
- * and avoid re-counting them all ... hence, the final lines
- * of this routine
- */
-
- void do_add_neighborhood (cmd)
- char cmd[];
- {
- int nsize;
- extern int level;
- extern char *subset;
- extern FILE *doc_file, *notes_file;
- void beep (), set_neighborhood_bits (), get_key_rec ();
- extern KEY_REC this_rec, prev_rec;
- extern long current_item[], subset_rec_count;
-
- if (doc_file == NULL)
- {
- beep ();
- printf ("No file open!\n");
- return;
- }
- if (subset == NULL)
- {
- beep ();
- printf ("No subset!\n");
- return;
- }
-
- nsize = WORD_RANGE;
- if (cmd[1] == '&')
- {
- nsize = SENTENCE_RANGE;
- if (cmd[2] == '&')
- nsize = PARAGRAPH_RANGE;
- }
-
- level = INDEX;
- set_neighborhood_bits (nsize);
- empty_subset = FALSE;
- get_key_rec (&prev_rec, current_item[INDEX] - 1);
- get_key_rec (&this_rec, current_item[INDEX]);
- subset_rec_count = this_rec.ccount - prev_rec.ccount;
- printf ("%6ld/%-6ld %.28s\n", subset_rec_count, subset_rec_count,
- this_rec.kkey);
- if (notes_file != NULL)
- fprintf (notes_file, "%6ld/%-6ld %.28s\n", subset_rec_count,
- subset_rec_count, this_rec.kkey);
- }
-
-
- /* function to set n bits on each side of each current index term in the
- * subset[] array ... note that it is important to set min_item[CONTEXT]
- * and max_item[CONTEXT] values before using them, since they are only
- * set ordinarily when descending ('=' command) into the CONTEXT level...
- * values of prev_rec and this_rec are set every time an index item
- * is displayed, so they will be all right for us already....
- */
-
- void set_neighborhood_bits (n)
- register int n;
- {
- register long i;
- register PTR_REC j, j0, j1;
- extern long min_item[], max_item[];
- extern KEY_REC prev_rec, this_rec;
- PTR_REC get_ptr_rec ();
- void set_subset_bit ();
-
- min_item[CONTEXT] = prev_rec.ccount;
- max_item[CONTEXT] = this_rec.ccount - 1;
-
- if ((this_rec.ccount-prev_rec.ccount) * n > GIVE_WARNING)
- printf ("Please wait (%ld bits to set)...\n",
- (this_rec.ccount - prev_rec.ccount) * n * 2 + 1);
-
- for (i = min_item[CONTEXT]; i <= max_item[CONTEXT]; ++i)
- {
- j = get_ptr_rec (i);
- j0 = j - n * NEIGHBORHOOD;
- if (j0 < min_item[TEXT])
- j0 = min_item[TEXT];
- j1 = j + n * NEIGHBORHOOD;
- if (j1 > max_item[TEXT])
- j1 = max_item[TEXT];
- for (j = j0; j <= j1; j += NEIGHBORHOOD)
- set_subset_bit (j);
- }
- }
-
-
- /* function to set a bit in the array subset[] for a chosen character
- * offset from the start of the file ... just convert the offset into
- * byte and bit numbers and then "OR" the 1 into that slot....
- *
- * (An attempt to optimize for the specific value NEIGHBORHOOD = 32,
- * using & and >> in place of % and /, did not yield a significant
- * improvement in speed... ^z 870813)
- */
-
- void set_subset_bit (offset)
- long offset;
- {
- int bit_no;
- long byte_no;
- extern char *subset;
-
- bit_no = (offset % (8 * NEIGHBORHOOD)) / NEIGHBORHOOD;
- byte_no = offset / (8 * NEIGHBORHOOD);
-
- subset[byte_no] |= 1 << bit_no;
- }
-
-
- /* function to return the value of a bit in the array subset[] for a
- * chosen character offset from the start of the document file ...
- * as in set_subset_bit() above, just convert offset into byte and
- * bit numbers, extract the relevant bit, shift it down, and return
- * it....
- *
- * (An attempt to optimize for the specific value NEIGHBORHOOD = 32,
- * using & and >> in place of % and /, did not yield a significant
- * improvement in speed... ^z 870813)
- */
-
- int get_subset_bit (offset)
- long offset;
- {
- int bit_no;
- long byte_no;
- extern char *subset;
-
- bit_no = (offset % (8 * NEIGHBORHOOD)) / NEIGHBORHOOD;
- byte_no = offset / (8 * NEIGHBORHOOD);
-
- return ((subset[byte_no] >> bit_no) & 1);
- }
-
-
- /* function to count and return the number of occurrences of this_rec
- * in the working subindex (i.e., the number of times the word comes
- * up with its subset[] bit turned ON) ....
- */
-
- long count_subset_recs (this_recp, prev_recp)
- KEY_REC *this_recp, *prev_recp;
- {
- register long i, n;
- extern int empty_subset;
- PTR_REC get_ptr_rec ();
- int get_subset_bit ();
-
- if (empty_subset)
- return (0L);
-
- if (this_recp->ccount - prev_recp->ccount > GIVE_WARNING)
- printf ("Please wait (%ld words to check)...\n",
- this_recp->ccount - prev_recp->ccount);
-
- for (n = 0, i = prev_recp->ccount; i < this_recp->ccount; ++i)
- n += get_subset_bit ((long)get_ptr_rec (i));
-
- return (n);
- }
-
-
-
-